{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n",
      "directory /home/jovyan/work/Sections/Data/Weather already exists\n",
      "removing /home/jovyan/work/Sections/Data/Weather/NY.parquet\n",
      "removing /home/jovyan/work/Sections/Data/Weather/NY.tgz\n",
      "wget https://mas-dse-open.s3.amazonaws.com/Weather/by_state/NY.tgz -P /home/jovyan/work/Sections/Data/Weather \n",
      "--2018-04-09 01:16:40--  https://mas-dse-open.s3.amazonaws.com/Weather/by_state/NY.tgz\n",
      "Resolving mas-dse-open.s3.amazonaws.com (mas-dse-open.s3.amazonaws.com)... 52.218.196.178\n",
      "Connecting to mas-dse-open.s3.amazonaws.com (mas-dse-open.s3.amazonaws.com)|52.218.196.178|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 23182008 (22M) [application/x-tar]\n",
      "Saving to: ‘/home/jovyan/work/Sections/Data/Weather/NY.tgz’\n",
      "\n",
      "NY.tgz              100%[===================>]  22.11M   279KB/s    in 65s     \n",
      "\n",
      "2018-04-09 01:17:46 (347 KB/s) - ‘/home/jovyan/work/Sections/Data/Weather/NY.tgz’ saved [23182008/23182008]\n",
      "\n",
      "-rwxrwxrwx 1 jovyan staff 23M Mar 16 20:25 /home/jovyan/work/Sections/Data/Weather/NY.tgz\n",
      "NY.parquet/\n",
      "NY.parquet/_SUCCESS\n",
      "NY.parquet/part-00000-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00001-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00002-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00003-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00004-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00005-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00006-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00007-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00008-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00009-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00010-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00011-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00012-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "NY.parquet/part-00013-8342bcf4-7fc2-4183-8e11-aefdb4915fbb-c000.snappy.parquet\n",
      "/home/jovyan/work/Sections/Data/Weather/NY.parquet\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "|    Station|Measurement|Year|              Values|State|\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "|USC00303452|       PRCP|1903|[00 7E 00 7E 00 7...|   NY|\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "only showing top 1 row\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Initialize and load weather dataframe\n",
    "\n",
    "from pyspark import SparkContext\n",
    "sc = SparkContext(master=\"local[4]\")\n",
    "#sc.version\n",
    "\n",
    "import os\n",
    "import sys\n",
    "\n",
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SQLContext\n",
    "from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType\n",
    "%pylab inline\n",
    "\n",
    "# Just like using Spark requires having a SparkContext, using SQL requires an SQLContext\n",
    "sqlContext = SQLContext(sc)\n",
    "\n",
    "from os.path import split,join,exists\n",
    "from os import mkdir,getcwd,remove\n",
    "from glob import glob\n",
    "\n",
    "# create directory if needed\n",
    "\n",
    "notebook_dir=getcwd()\n",
    "data_dir=join(split(split(notebook_dir)[0])[0],'Data')\n",
    "weather_dir=join(data_dir,'Weather')\n",
    "\n",
    "if exists(weather_dir):\n",
    "    print('directory',weather_dir,'already exists')\n",
    "else:\n",
    "    print('making',weather_dir)\n",
    "    mkdir(weather_dir)\n",
    "\n",
    "file_index='NY'\n",
    "zip_file='%s.tgz'%(file_index) #the .csv extension is a mistake, this is a pickle file, not a csv file.\n",
    "old_files='%s/%s*'%(weather_dir,zip_file[:-3])\n",
    "for f in glob(old_files):\n",
    "    print('removing',f)\n",
    "    !rm -rf {f}\n",
    "\n",
    "command=\"wget https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s -P %s \"%(zip_file, weather_dir)\n",
    "print(command)\n",
    "!$command\n",
    "!ls -lh $weather_dir/$zip_file\n",
    "\n",
    "#extracting the parquet file\n",
    "!tar zxvf {weather_dir}/{zip_file} -C {weather_dir}\n",
    "\n",
    "weather_parquet = join(weather_dir,zip_file[:-3]+'parquet')\n",
    "print(weather_parquet)\n",
    "df = sqlContext.read.load(weather_parquet)\n",
    "df.show(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Dataframe operations\n",
    "Spark DataFrames allow operations similar to pandas Dataframes. We demonstrate some of those.\n",
    "\n",
    "For more, see the [official guide](https://spark.apache.org/docs/latest/sql-programming-guide.html) and [this article](https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- Station: string (nullable = true)\n",
      " |-- Measurement: string (nullable = true)\n",
      " |-- Year: integer (nullable = true)\n",
      " |-- Values: binary (nullable = true)\n",
      " |-- State: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84199\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "|    Station|Measurement|Year|              Values|State|\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "|USC00303452|       PRCP|1903|[00 7E 00 7E 00 7...|   NY|\n",
      "+-----------+-----------+----+--------------------+-----+\n",
      "only showing top 1 row\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(df.count())\n",
    "df.show(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### .describe()\n",
    "The method `df.describe()` computes five statistics for each column of the dataframe `df`.\n",
    "\n",
    "The statistics are: **count, mean, std, min,max**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "You get the following man page using the command `df.describe?`\n",
    "\n",
    "```\n",
    "Signature: df.describe(*cols)\n",
    "Docstring:\n",
    "Computes statistics for numeric and string columns.\n",
    "\n",
    "This include count, mean, stddev, min, and max. If no columns are\n",
    "given, this function computes statistics for all numerical or string columns.\n",
    "\n",
    ".. note:: This function is meant for exploratory data analysis, as we make no\n",
    "    guarantee about the backward compatibility of the schema of the resulting DataFrame.\n",
    "\n",
    ">>> df.describe(['age']).show()\n",
    "+-------+------------------+\n",
    "|summary|               age|\n",
    "+-------+------------------+\n",
    "|  count|                 2|\n",
    "|   mean|               3.5|\n",
    "| stddev|2.1213203435596424|\n",
    "|    min|                 2|\n",
    "|    max|                 5|\n",
    "+-------+------------------+\n",
    ">>> df.describe().show()\n",
    "+-------+------------------+-----+\n",
    "|summary|               age| name|\n",
    "+-------+------------------+-----+\n",
    "|  count|                 2|    2|\n",
    "|   mean|               3.5| null|\n",
    "| stddev|2.1213203435596424| null|\n",
    "|    min|                 2|Alice|\n",
    "|    max|                 5|  Bob|\n",
    "+-------+------------------+-----+\n",
    "\n",
    ".. versionadded:: 1.3.1\n",
    "File:      ~/spark-2.2.1-bin-hadoop2.7/python/pyspark/sql/dataframe.py\n",
    "Type:      method\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+-----------+\n",
      "|    station|measurement|\n",
      "+-----------+-----------+\n",
      "|      84199|      84199|\n",
      "|       null|       null|\n",
      "|       null|       null|\n",
      "|USC00300015|       PRCP|\n",
      "|USW00094794|       TOBS|\n",
      "+-----------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.describe().select('station','measurement').show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### groupby and agg\n",
    "The method `.groupby(col)` groups rows according the value of the column `col`.  \n",
    "The method `.agg(spec)` computes a summary for each group as specified in `spec`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------+---------+--------------+\n",
      "|measurement|min(year)|count(station)|\n",
      "+-----------+---------+--------------+\n",
      "|       TMIN|     1873|         13442|\n",
      "|       TOBS|     1876|         10956|\n",
      "|       TMAX|     1873|         13437|\n",
      "|       SNOW|     1884|         15629|\n",
      "|       SNWD|     1888|         14617|\n",
      "|       PRCP|     1871|         16118|\n",
      "+-----------+---------+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.groupby('measurement').agg({'year': 'min', 'station':'count'}).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "# THis command will load the python module that defines the SQL functions\n",
    "#%load ls ~/spark-latest/python/pyspark/sql/functions.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "### Using SQL queries on DataFrames\n",
    "\n",
    "There are two main ways to manipulate  DataFrames:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Imperative manipulation\n",
    "Using python methods such as `.select` and `.groupby`.\n",
    "* Advantage: order of operations is specified.\n",
    "* Disrdavantage : You need to describe both **what** is the result you want and **how** to get it."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Declarative Manipulation (SQL)\n",
    "* Advantage: You need to describe only **what** is the result you want.\n",
    "* Disadvantage: SQL does not have primitives for common analysis operations such as **covariance**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "### Using sql commands on a dataframe\n",
    "Spark supports a [subset](https://spark.apache.org/docs/latest/sql-programming-guide.html#supported-hive-features) of the Hive SQL query language.\n",
    "\n",
    "For example, You can use [Hive `select` syntax](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select) to select a subset of the rows in a dataframe.\n",
    "\n",
    "To use sql on a dataframe you need to first `register` it as a `TempTable`.\n",
    "\n",
    "for variety, we are using here a small dataframe loaded from a JSON file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+-------+\n",
      "| age|   name|\n",
      "+----+-------+\n",
      "|null|Michael|\n",
      "|  30|   Andy|\n",
      "|  19| Justin|\n",
      "+----+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# when loading json files you can specify either a single file or a directory containing many json files.\n",
    "path = \"../../Data/people.json\"\n",
    "\n",
    "# Create a DataFrame from the file(s) pointed to by path\n",
    "people = sqlContext.read.json(path)\n",
    "#print('people is a',type(people))\n",
    "# The inferred schema can be visualized using the printSchema() method.\n",
    "people.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- age: long (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "people.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Justin\n"
     ]
    }
   ],
   "source": [
    "# Register this DataFrame as a table.\n",
    "people.registerTempTable(\"people\")\n",
    "\n",
    "# SQL statements can be run by using the sql methods provided by sqlContext\n",
    "teenagers = sqlContext.sql(\"SELECT name FROM people WHERE age >= 13 AND age <= 19\")\n",
    "for each in teenagers.collect():\n",
    "    print(each[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Counting the number of occurances of each measurement, imparatively"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "L=df.groupBy('measurement').count().collect()\n",
    "#L is a list (collected DataFrame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": false,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('TOBS', 10956),\n",
       " ('TMAX', 13437),\n",
       " ('TMIN', 13442),\n",
       " ('SNWD', 14617),\n",
       " ('SNOW', 15629),\n",
       " ('PRCP', 16118)]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "D=[(e.measurement,e['count']) for e in L]\n",
    "sorted(D,key=lambda x:x[1], reverse=False)[:6]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Counting the number of occurances of each measurement, declaratively."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [],
   "source": [
    "sqlContext.registerDataFrameAsTable(df,'weather') #using older sqlContext instead of newer (V2.0) sparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true,
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "SELECT measurement,COUNT(measurement) AS count,\n",
      "                   MIN(year) AS MinYear \n",
      "FROM weather  \n",
      "GROUP BY measurement \n",
      "ORDER BY count\n",
      "\n",
      "+-----------+-----+-------+\n",
      "|measurement|count|MinYear|\n",
      "+-----------+-----+-------+\n",
      "|       TOBS|10956|   1876|\n",
      "|       TMAX|13437|   1873|\n",
      "|       TMIN|13442|   1873|\n",
      "|       SNWD|14617|   1888|\n",
      "|       SNOW|15629|   1884|\n",
      "|       PRCP|16118|   1871|\n",
      "+-----------+-----+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "query=\"\"\"\n",
    "SELECT measurement,COUNT(measurement) AS count,\n",
    "                   MIN(year) AS MinYear \n",
    "FROM weather  \n",
    "GROUP BY measurement \n",
    "ORDER BY count\n",
    "\"\"\"\n",
    "print(query)\n",
    "sqlContext.sql(query).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Performing a map command\n",
    "* In order to perform a `map` on a dataframe, you first need to transform it into an RDD."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "* **Not** the recommended way. Better way is to use built-in sparkSQL functions.\n",
    "* Or register new ones (Advanced)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_century(row):\n",
    "    if row.Year < 1900:\n",
    "        return '19th'\n",
    "    elif row.Year <2000:\n",
    "        return '20th'\n",
    "    elif row.Year <2010:\n",
    "        return '21st'\n",
    "    else:\n",
    "        return 'possibly_bad_data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['20th', '20th', '20th', '20th', '20th']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rdd.map(find_century).take(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "#### Aggregations \n",
    "* **Aggregation** can be used, in combination with built-in sparkSQL functions \n",
    "to compute statistics of a dataframe.\n",
    "* computation will be fast thanks to combined optimzations with database operations."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "* A partial list : `count(), approx_count_distinct(), avg(), max(), min()`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "source": [
    "* Of these, the interesting one is `approx_count_distinct()` which uses sampling to get an approximate count fast."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "* [The gory details](http://spark.apache.org/docs/2.2.0/api/python/_modules/pyspark/sql/functions.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "outputs": [],
   "source": [
    "import pyspark.sql.functions as F # used here just for show."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+\n",
      "|approx_count_distinct(station)|\n",
      "+------------------------------+\n",
      "|                           339|\n",
      "+------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.agg({'station':'approx_count_distinct'}).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Approximate Quantile\n",
    "\n",
    "* Suppose we want to partition the years into 10 ranges\n",
    "* such that in each range we have approximately the same number of records.\n",
    "* The method `.approxQuantile` will use a sample to do this for us."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "with accuracy 0.1:  [1871.0, 1945.0, 1949.0, 1965.0, 1972.0, 1978.0, 1984.0, 1988.0, 2013.0]\n",
      "with accuracy 0.01:  [1917.0, 1936.0, 1949.0, 1957.0, 1966.0, 1975.0, 1984.0, 1993.0, 2003.0]\n"
     ]
    }
   ],
   "source": [
    "print('with accuracy 0.1: ',df.approxQuantile('year', [0.1*i for i in range(1,10)], 0.1))\n",
    "print('with accuracy 0.01: ',df.approxQuantile('year', [0.1*i for i in range(1,10)], 0.01))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "#### Lets collect the exact number of rows for each year\n",
    "This will take much longer than ApproxQuantile on a large file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELECT year,COUNT(year) AS count FROM weather GROUP BY year ORDER BY year\n",
      "counts is  DataFrame[year: int, count: bigint]\n"
     ]
    }
   ],
   "source": [
    "# Lets collect the exact number of rows for each year ()\n",
    "query='SELECT year,COUNT(year) AS count FROM weather GROUP BY year ORDER BY year'\n",
    "print(query)\n",
    "counts=sqlContext.sql(query)\n",
    "print('counts is ',counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEKCAYAAAAb7IIBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4FVX6wPHvm04qJEBISCD0Ki1UURcEC4piQ0WlWJZdG+o2cXddt7irq/50dd21IuJa0LXBYkEpQQFD74QSQkgCIYEkpPd7fn/cIQYIkHZLct/P89wnc8+cmfveSTLvzJmZc8QYg1JKKc/j5eoAlFJKuYYmAKWU8lCaAJRSykNpAlBKKQ+lCUAppTyUJgCllPJQmgCUUspDaQJQSikPpQlAKaU8lI+rAziX9u3bm7i4uGZfb3FxMUFBQc2+XmfQ2F1DY3cNjb1xNm3adNwY0+F89dw6AcTFxbFx48ZmX29CQgLjxo1r9vU6g8buGhq7a2jsjSMih+pTT5uAlFLKQ2kCUEopD6UJQCmlPJRbXwNQSqnzqaysJCMjg7KyMleHcoqwsDCSkpIc+hkBAQHExMTg6+vbqOU1ASilWrSMjAxCQkKIi4tDRFwdTo3CwkJCQkIctn5jDDk5OWRkZNCtW7dGrUObgJRSLVpZWRkRERFutfN3BhEhIiKiSWc+mgCUUi2ep+38T2rq99YEoJSbWLknm+0ZJ1wdhvIgmgCUcrGqaht/WbKbO9/ewN0LNlJcXuXqkJSb+cc//kFJSUmzr1cTgFIuVF5VzZ1vb2De6oNMGtiJY4XlvPZdSr2WTUzJIb+00sERKnegCUCpVuirHUf5fv9x/nTtAF65I57Jg6J4/bsDHM0/94W9IydKmfZGIs8t3eukSNX5vPPOOwwaNIjBgwczffp00tLSmDBhAoMGDWLChAmkpaUBMGvWLD7++OOa5YKDg4Efu4646aab6Nu3L7fffjvGGF566SWOHDnC+PHjGT9+fLPGfN7bQEXkLWAykG2MGWiVhQMfAnFAKnCzMSZP7FckXgSuAkqAWcaYzdYyM4HfW6t90hizoFm/iVIt0Pvr04iLCGT66K4APHplX77ZlcWTX+zm91f3JzLUv84LfV/uyMQY+N/2Izw+uT9+PnosB/Cn/+1i95GCZl1n/+hQnrhmwDnr7Nq1i7/+9a+sWbOG9u3bk5uby+23386MGTOYOXMmb731FnPmzOHzzz8/53q2bNnCrl27iI6OZuzYsaxZs4Y5c+bw/PPPs3LlStq3b9+cX61eZwBvA1eeVjYXWG6M6QUst94DTAJ6Wa/ZwCtQkzCeAEYBI4EnRKRdU4NXqiVLzi5i/cFcbhnRBS8v+04+NjyQey7uxpLtmYx+ajmD//QNa5OPn7HslzsyaePrzYmSSlbtO3bKvMKySmbNX8/nWw475XsoWLFiBTfddFPNDjo8PJz169dz2223ATB9+nRWr1593vWMHDmSmJgYvLy8GDJkCKmpqY4M+/xnAMaY70Qk7rTiKcA4a3oBkAA8apW/Y4wxQKKItBWRKKvut8aYXAAR+RZ7Uvmgyd9AqRbqww1p+HgJN8XHnFL+q8v7cEnvDuzPKuTZpXv5dMthLuz545HfkROlbE47wSMTe/POD6l8tiWDy/pHAlBtMzy8cCsJe49x8Hgx1w6OduZXcrnzHak7ijHmvLdknpzv4+ODzWarWa6ioqKmjr+/f820t7c3VVWOvSGgsU8CRxpjMgGMMZki0tEq7wyk16qXYZWdrfwMIjIb+9kDkZGRJCQkNDLEsysqKnLIep1BY3eN5o690mb4ILGEIR282bXphzrrxAJ92xqW7TzMyva5NTuQpan2C7+R5ekMa2/jm11H+eLblQT5Ch/vq2B5SiUD23uz83gJr3++ghi/0la93cPCwigsLHROQGcxevRobrvtNu655x4iIiLIzc1l5MiRzJ8/n2nTpvHee+8xevRoCgsLiYqK4ocffmDSpEksWbKEyspKCgsLKSkpoaqqqua7VFRUUFZWRmFhIUFBQWRmZp6SIE4qKytr9O+3ubuCqCsFmnOUn1lozOvA6wDDhw83juhPW/sYdw2N/UeLth6mqHIrc66O55LeZx+3Izs4nd98vJ1OfePpFxUKwD9fWUu/qABuvfpi+qWf4Nt/rWFndRTpx0pYkpLJtJFd+MPk/oz86zL2VUXQN/xEq97uSUlJDu1yoT5GjhzJ448/zuTJk/H29mbo0KE8++yzzJkzh5dffpkOHTowf/58QkJCeOCBB5gyZQoTJkxgwoQJBAUFERISQmBgID4+PjXfxc/Pj4CAAEJCQvj5z3/O1KlTiYqKYuXKlad8dkBAAEOHDm1U3I1NAFkiEmUd/UcB2VZ5BvYDl5NigCNW+bjTyhMa+dlKtWhF5VU88/VeenYM5qKe576od0kve3L4bt8x+kWFkplfyqZDefzq8t4ADIoJo3uHIF5JOECQnzf3jevBwxN74+fjxTVDovl0cwYTLwlw+HdSMHPmTGbOnFnzvrCwkBUrVpxRLzIyksTExJr3Tz31FADjxo07Jdm9/PLLNdMPPvggDz74YLPH3NhbBxYDJ7/pTGBRrfIZYjcayLeaipYCl4tIO+vi7+VWmVIe57mlezmSX8rTN1xQc/H3bDqFBdC3U0jNhd75a1IBuHqQvW1fRHhyykB+c2UfVj96Kb+5sm/NHUE3D4+lrNLG+qP6YJmq23kTgIh8APwA9BGRDBG5G3gauExE9gOXWe8BvgRSgGTgDeA+AOvi71+ADdbrzycvCCvlSTYdymXBD6nMHBPH8Ljwei3zk94d2JCay+a0POatPsgtw2Pp1v7HsWYv7Nme+8b1pF2Q3ynLDY4Jo3dkMKsy6k4A7yYe4tqXV1NZbWv091Et23kTgDFmmjEmyhjja4yJMcbMM8bkGGMmGGN6WT9zrbrGGHO/MaaHMeYCY8zGWut5yxjT03rNd+SXUspd/XlJEtFhbfj1FX3qvcwlvTtQWW24++0NhAb4MHdS33otJyJMG9mFg/k2tqTlnTIvp6icv3+1h+0Z+ayp4zbTqmobGXnN/+Spo9hvPPQ8Tf3e+vSIUk5ijGHv0QKuHNiJIP/6X34bHteONr7e5JVU8thV/c440j+XqcNjaeMDb1lNRyf9Y9l+SiqrCfLzZvHWI6fMK6us5q4FG7n4mZX8b9uP82w2Q5Ubni0EBASQk5PjcUng5HgAAQGNv8ajA8Io5SSF5VWUVdroGHLmrXzn4u/jzdWDosguLOemYTHnX6CWYH8fLonx4csdmfz2qr5EhbUhObuQ99encdvILlRW2/jftiOUVlTTxs+bkooq7lmwkR9ScujePohHPtxKG19vyqtsPPVVEhHB/nz0s9H4+3g3KA5HiomJISMjg2PHjp2/shOVlZU1aedcHydHBGssTQBKOUl2QTkAHUMblgAAnps6uF4PG9VlYhdfvj1UxTs/HGL66K7M/WQHgb7ePDyxF3uzClm4IZ3le7K4rH8k9yzYSGJKDs/fPJiJ/SK5/c113POOvSW3W/sgtqWf4Jmv9/L45P4NjsNRfH19Gz0iliMlJCQ0+vZMZ9EEoJSTHCu0EkBI444KGzv4R4dAL64Y0IkFa1OZv+YgNgN/u/4CIoL9GRXoR2SoP59vOcy3u7NYe8C+879+qP2ocsGdI3li8S5Gd4/glhGx/Pl/u5i3+iBje0YQGuDL9/uPM6ZHBKO7RzQqNuVamgCUcpLsQnsPnw1tAmoOsy/pzrKkLK4Y0Im5k/oS0y4QAG8vYfKgaOatPgjAr6/oww21mpnaBfnx0rQfj2Ifu6of6w7mctfbNfd38OLy/Uwb2YXHrupLaEDjBid3hbUHjhMZGkCPDsGuDsVlNAEo5SRNPQNoiqFd2rHrT1fW2Wvo9UM7M3/NQW4d2YX7xvU453oCfL359+3DeP27FMb0iODCHu154/sU3vw+hcSUHD6cPZqOoef/fqv3H2dvViHhQb5c0LktPTs6bydcbTM8981eXkk4QIcQf76Yc5FLfifuQBOAUk6SXViOn48XoW1c8293ti6jB3YOY83cS+kUGlCvZqbuHYJ5+sZBNe9/e1U/JvaLZNb89dwxbx0fzh5zzjuVVu7J5q4FGzh5046ftxfLfvETukQENuwLNUJxeRUPfrCFFXuyuWZwNN/uPsqcD7bw7t2jqDaGzYdOMCKuHT7ennGDpGd8S6XcQHZBGR1D6u7f39Wiwto0Ka6R3cJ5c8ZwUnNKmDl/PSUVdT98lpxdyJwPttA/KpR1v53A4gfGIgIvrdjf6M+ur8KySma8tZ5V+47xl+sG8s9pQ/nrdReQmJLLjLfWM+apFUx7I5G31hx0eCzuQhOAUk6SXVjukvZ/Z7mwZ3v+ddswtmfk80rCgZrysspqVu7J5vXvDnD3go34+3rx+ozhRIYGMCimLXeM7sqnmzM4cKzIYbFlFZRxx7z1bEs/wT+nDa0ZgOfG+BjuGN2FdQdzGRHXjn5Roby3Lg2bzTOeKdAmIKWcJLuwnJ6t/ILjZf0juW5INK99l8LU+Fg6hvpz+5vr2HTI/iRydFgAr02Pp3PbNjXL3DuuB++vS+PFZftPueAMYGviw10vr9jPZ1sOc+BYMb7ewr9vH8blAzqdUucvUwYyd1I/gv19WLT1MA8t3Mrq5OPn7KW1tdAEoJSTZBeUcWGP1n+75NxJ/fhmdxZ/XrIbHy9hc1oef7/xAq4Y0Im2gWdeG2gf7M+ssXG8uuoAsy/pzsDOYQC8tuoAL68o4bMBhfTs2PDunr/ZdZTnvtnHyG7h3BQfy8R+HekVeeZ6RIRg68nsKwd2IiLIj3cTD3lEAtAmIKWcoKyymoKyqlbdBHRSp7AA7h/fk2VJWXy96yi/v7o/t4zoUufO/6SfXdKd9sH+3PfeZvKKK9h0KI9nlu6lsBIe/GArZZXV5/zMapvh8c93cvubiRwvKqekooo//W83fSJDeO+eUdw7rkedO//T+ft4c/OIWJYlZZF6vJj//JDK7z7b4ZZdYDQHPQNQyglceQuoK9x9UTdW7TvGqG7h3H3R+Z/SbRvox2vT47n1tUTufW8TGXmlRIUFcHWsjde2F/D3r/ecdbjHqmobv/zvNhZtPWIfYvOVtcR3DefwiVL++/Mx+Dbwjp7bRnbh1VUHuOyFVVRW25ugJg+KZkwrPHvTMwClnODkQ2AdGtENREsU4OvNRz8bwy8vr3+vp8O6tONvN9jvysnML+OlaUMZE+3DrAvjmL8mleVJWWcsc/K2zkVbj/CbK/vw4c9Gc6K0kk82Z3Dz8BhG1LPL7dpiwwO5Y1RXRnePYN7M4fh6Cyv3Zp9/wRZIzwCUcoKafoA8oAmoKW6Kj6G0sprQAB+GdWlHQgrMndSXDam5/OKjbSx58CJiw+3PC+w6ks+D728hNaeYxyf3rznT+PjnF/Ju4iEemtCr0XH85bqBNdOjukWwYk82v72qX9O+nBvSMwClnCDbw5qAmmL66K5MGdK55n2Arzev3B6PzRjufW8T6bklPLlkN9f/ay3FFVW8d8/oU5qZenYM5o/XDmhQt9nnMr5vR5Kzi0jPbTnjI9SXJgClnCC7sAxvLyGimXZKnqZLRCDP3zyEnYcLuPiZlcxfm8rkwVF89dAlDm+bv7RvRwBW7Gl9zUDaBKSUE2QXlNM+2O+8YwCrs7usfyR/mNyflONF/PTi7nSNCDr/Qs2gW/sgurUPYsWebGZeGOeUz3QWTQBKOYH9KWBt/mmqu+pxR5EjjO/TkXfXHaKkoopAv9az29QmIKWcoLV3A9HaXdq3IxVVNtYm57g6lGalCUApJzhWWNaokcCUexjZLZzQAB++3Jnp6lCaVes5l1HKTVVV28gprqCDNgG1WH4+XkwaGMWS7Ucou76aAN/mGRN5R0Y+G1JzySupoF2gX4ObuCqrbWw4mMuFPds36vM1ASjlYMeLKjBGnwFo6aYMiebDjeksT8rm6kFRTV5fZbWNmfPXk1tcUVM2rGs7hsS2rfc6Pt2cwaOf7GDxA2MZFFP/5U7SJiClHCwjz37/uCaAlm1U9wg6hvizaOvhZlnf6v3HyS2u4OXbhrL9j5cT7O/DgrWpDVpHYkouAD8caNy1CU0ASjmQMYYXlu0jJMCH+K7tXB2OagJvL+GawdEk7D1GfkklxhhSjxfzza6jvPl9Codyihu0vsXbjhDWxpfL+3ciNMCXm+Jj+GJ7Zk2/UfWx/qA9AayzfjaUJgClHGjxtiOsSc7hN1f0ISJYzwBauilDoqmotvHsN3u46dUfGPdcArP/s4knv0jihn+vJSmzoF7rKa2oZumuo1x1QVTNUJ3Tx3SlotrGwvVp9VpHRl4Jh0+UEuTnzYaDuVQ3YhAbTQBKOUh+aSV/WZLE4JgwbhvV1dXhqGZwQecwurUP4t3ENDJPlPL45P4sun8sX8y5CD8fL6a9kci6lJzzdh+9LCmLkopqpgyJrinr0SGYi3u15911h6isR/fTG1LtR/13jOlKYXlVvZNPbXoRWCkHeWn5fnKLy3n7zhF46xPArYKI8NzUwRzKKWbyoOiao3eAD2ePYdobidzyeiJ+3l50CoRftj3MtYOjzxhvedHWI3QKDWDkab2VzrowjrsXbGTIn74hItif64d25pHLetcZy/qDuYQE+DBzTByvrUohMSWnZjCd+tIzAKUcIKeonPfWHeK6oZ0b/E+p3Ft813bcMCzmlJ0/2PsrWvzAWJ6bOpi7LuqGAR5auJUbXllLcnZhTb284gpW7cvm2iHRZ3QNMr5PR/54TX9uHhFLTLs2vLh8P59uzqgzDvs4xuFEt21D14jARl0HaNIZgIg8AtwDGGAHcCcQBSwEwoHNwHRjTIWI+APvAPFADnCLMSa1KZ+vlLt6e20q5VU27hvXw9WhKCeKCPbnpvgYAEYGZJIT0pOnvtrDA+9vYcmDF+Hj7cW/E5KpspmaerV5eQmzxtqfBaiqtnH7m+v47Wc7GBAdRu/IYHKLKwgP8uN4UQUpx4q5eXgsAKO6hbN0VxY2m2lQf1ONPgMQkc7AHGC4MWYg4A3cCvwdeMEY0wvIA+62FrkbyDPG9AResOop1eoUllWyYG0ql/ePbNRYtqp18BJh6vBY/nb9QPYcLeT99Wmk5ZSwYO0hpsbH0Ps8Q1T6eHvxz9uGEhLgy9RX1zLwiaXEP7mMW19P5ONN9rOCkd3sTUijukWQX1rJnqOF51rlmZ/RuK92yvJtRKQSCAQygUuB26z5C4A/Aq8AU6xpgI+Bl0VEjDENv3StlBt7f10aBWVV3Deup6tDUW7gigGdGNszgv/7Zh/Lk7Lx9pJ6j5TWMSSAN2YM59WEA3QKC6BtoC/zVh9k3cFcAny9GBhtb14c1d2eCNYfzKF/dGi9Y2t0AjDGHBaR54A0oBT4BtgEnDDGVFnVMoCTIzt0BtKtZatEJB+IAI43Ngal3I0xhrfXpjK2ZwSDG/BEp2q9RIQnrhnApBe/Z9W+Yzw8sReRofXvFmRIbFtenR5f837ayC48+UUSkSH+NdchYtoFEuDrxZH8sobF1tgDcBFpB3wC3AKcAP5rvX/CauZBRGKBL40xF4jILuAKY0yGNe8AMNIYk3PaemcDswEiIyPjFy5c2Kj4zqWoqIjg4OBmX68zaOyuUd/Y88psPJJQyh39/JjY1dcJkZ2fJ2x3d3R67J/ur2BTVhV/GN0Gf5/mvytszopihkX6MGuAP+PHj99kjBl+3oWMMY16AVOBebXez8De1HMc8LHKxgBLremlwBhr2seqJ+f6jPj4eOMIK1eudMh6nUFjd436xr5qb7bp+ugSszb5uGMDagBP2O7uqK7Yq6ttDvu8S55ZYeZ8sNkYYwyw0dRjP96U20DTgNEiEij2m1wnALuBlcBNVp2ZwCJrerH1Hmv+CitQpVqNfVn2i3B9OunFX3UmR44IF+TnQ3F51fkr1tLoBGCMWYf9Yu5m7LeAegGvA48CvxCRZOxt/POsReYBEVb5L4C5jf1spdzVnqOFdAjxJ1zH/lVOFuzvQ1EDE0CT7gIyxjwBPHFacQowso66ZdibjZRqtfZlFdLnPLf3KeUIQf7eHC+qOH/FWvRJYKWaic1m2JdVeN77u5VyhCB/H4ornNQEpJQ6VXpeCWWVNvpq+79ygWB/J14DUEqd6uRTmL01ASgXCPTzobi8ukHLaAJQqpnssxJAr44t87511bIF+3tTXFFFQ26u1ASgVDPZm1VIl/BAgvy1l3XlfEH+PhgDJRX1PwvQBKBUM9l7VC8AK9c5eeDRkOsAmgCUagblVdUcPF5Mn07a/KNcI9hKAA15FkATgFLN4ODxYqpshj6d6t8To1LN6eQZgDYBKeVki7YeAexjxirlCkF+3oCeASjlVKnHi5n3/UFuHBZDt/ZBrg5HeSi9BqCUCzz5xW78fLx49Mr6DfKhlCME6TUApZxr5d5sliVl8+ClPenYgEE+lGpuwTVnAHoNQCmHO5pfxqMfb6d7hyDutAbyVspVgvzt1wC0CUgpByurrOZn/9lIcXkVr9weXzM0n1KuEuTX8CYgfWRRqUb47ac72JaRz+vT43XwF+UWvLyEQD9vShrQI6getijVQEdOlPLplsP87CfduXxAJ1eHo1SNQD8fivQagFKOsyE1F4BrBkW7OBKlThXs763XAJRypHUHcwnx96FflD71q9xLUAPHBNAEoFQDrT+YS3xcO7wdOMC3Uo0R1MBxgTUBKNUAOUXlJGcXMbJbuKtDUeoMwQ0cFlITgFINsCE1D4BRmgCUG7I3AelFYKUcYv3BXPx9vLigc1tXh6LUGYL89CKwUg6zPjWHoV3a6oNfyi3pRWClHKSwrJLdRwoY2S3C1aEoVacgfx+KdTwApZrfpkN52Iy2/yv3FWz1B1RfmgCUqqdt6fmIwOBYbf9X7ulkl9D1pQlAqXranZlPXERQTbe7Srmbhv5tagJQqp52ZxbQP1qf/lXuK9BPE4BSza640pCeW0p/7f5BubEgZ14DEJG2IvKxiOwRkSQRGSMi4SLyrYjst362s+qKiLwkIskisl1EhjXls5VypvRCGwAD9AxAuTFnNwG9CHxtjOkLDAaSgLnAcmNML2C59R5gEtDLes0GXmniZyvlNIcK7AlAm4CUO3PaRWARCQUuAeYBGGMqjDEngCnAAqvaAuA6a3oK8I6xSwTaikhUYz9fKWdKK7DRIcSfjiE67q9yX848A+gOHAPmi8gWEXlTRIKASGNMJoD1s6NVvzOQXmv5DKtMKbeXVmjT9n/l9hp6BtCU+9l8gGHAg8aYdSLyIj8299Slrr5zzRmVRGZjbyIiMjKShISEJoRYt6KiIoes1xk0duertBkOF1UzuOpEi4y/pW530NgbymbO2KWemzGmUS+gE5Ba6/3FwBfAXiDKKosC9lrTrwHTatWvqXe2V3x8vHGElStXOmS9zqCxO9+OjBOm66NLzP+2HXZ1KI3SUre7MRp7Y/R7/CsDbDT12I83ugnIGHMUSBeRPlbRBGA3sBiYaZXNBBZZ04uBGdbdQKOBfGM1FSnlznYfKQBgQHSYiyNR6vwa0gzU1EcaHwTeExE/IAW4E/t1hY9E5G4gDZhq1f0SuApIBkqsukq5vd2ZBfh7Q9fwQFeHotR5NeRCcJMSgDFmKzC8jlkT6qhrgPub8nlKOVtpRTXLkrKIC/XCS4eAVC1AQx4G0yeBlTqH57/dS0ZeKdf19HN1KErVS1ADuoPQBKDUWWxLP8G81Qe5bVQX+kU07BF7pVzFaU1ASrU2i7cd4dGPtxMb3oaC0io6hPgzd1JfNieucXVoStVLYAMSgJ4BKFXLVzsyCfD1okt4IKFtfHjmpsGEBvi6Oiyl6m1Yl/qPV6FnAEpZbDbDuoO5XNo3kv+7ebCrw1GqUe4c24276llXzwCUsuzLLiS3uILR3XXIR+UZNAEoZUk8kAPA6O466LvyDJoAlLIkpuQS064NsfrAl/IQmgCU4mT7f44e/SuPoglAKWBvViF5JZWaAJRH0QSgFJCYcrL9Xy8AK8+ht4Eqj2WMYcn2TPZnF/H1zkxiw9sQ007b/5Xn0ASgPNZXO4/y4AdbEIG2bXy55+Lurg5JKafSBKBaJZvNUGUz+PnU3cppjOG171LoGhHIsl/8BF9vbQ1Vnkf/6lWr9LvPdzLmqeVsScurc/6G1Dy2pZ/gnou7685feSw9A1CtTnJ2IR9uSMNLhNveWMfzNw+mjZ83mw/l0adTKFdd0InXvztAeJAfNw2LcXW4SrmMJgDV6rywbD9tfL357P6x/OKjrdz73uZT5g+OCWNbRj4PTehFGz/t5ll5Lk0AqlVJyizgi+2ZPDC+J70jQ1g4ewyLtx6hS3ggQ7q05YvtR3h26T4C/byZMaarq8NVyqU0AahW5flv9xES4MNPrTt6gv19uG1Ul5r5t4zowuRB0eSXVhIR7O+qMJVyC5oAVKvx8aYMvt2dxa+v6ENY4Nn78A/y9yGoAYNmKNVa6e0PqsVKzi5k9f7jVNsMSZkF/P7zHYzpHsHPLtH7+ZWqDz0MUi2OzWZ4a81B/v71HiqrDdFhAYgIoQG+vDhtCD56W6dS9aIJQLUoR/PLmPvpdhL2HuOy/pFcOziajzamszE1j7fvHEHHkABXh6hUi6EJQLUIxhg+3JDOX79IotJm489TBjB9dFdEhGsGR2OMQURcHaZSLYomAOX2corK+fXH21mxJ5vR3cN5+oZBxLUPOqWO7vyVajhNAMqtbTqUy33vbSavuJI/XtOfGWPi8PLSnb1SzUETgHJbNpvh4Q+34u/jzWf3j2BAdJirQ1KqVdHbJZTbSkzJIT23lF9e3lt3/ko5gCYA5bY+2phOaIAPVwzo5OpQlGqVNAEot5RfWslXO48yZUhnAny1wzalHKHJCUBEvEVki4gssd53E5F1IrJfRD4UET+r3N96n2zNj2vqZ6uWq7Simqe/2sOxwvI65y/edoTyKhs3D491cmRKeY7mOAN4CEiq9f7vwAvGmF5AHnC3VX43kGeM6Qm8YNVTLVBZZTVvfJfC++vSGr2Od35I5dVVB3j9uwN1zv/vxnT6RYUysHNooz9DKXVuTUrAJ0LgAAAVTklEQVQAIhIDXA28ab0X4FLgY6vKAuA6a3qK9R5r/gTRm7dbnK92ZDLx+VX89csk/rBoJ5n5pQ1eR2lFNW98nwLAhxvSKamoOmX+yyv2sz0jn1tHxOr9/Uo5kBhjGr+wyMfAU0AI8CtgFpBoHeUjIrHAV8aYgSKyE7jSGJNhzTsAjDLGHD9tnbOB2QCRkZHxCxcubHR8Z1NUVERwcHCzr9cZmhL7skOVrEivZGIXXy6O8cG3AffTl1YZ/rO7grVHqogJFiZ18+XNHRVc2c2XW/r4nXf51PxqpLKUru2DWZpayQd7Kripty8f76tk1gA/xsX6Yozh8+RKFh2oZEy0N/cM9MfbTe7599S/GVfT2Btn/Pjxm4wxw89Xr9HPAYjIZCDbGLNJRMadLK6jqqnHvB8LjHkdeB1g+PDhZty4cadXabKEhAQcsV5naGzs+SWVPJiwApvNi3d2V/BNhhdThkYzaWAUg2PCznmkvfdoIT/7z0bScqt4eGIvHhjfEx9vLzLf38yqvcd4ZuZYQgLs3S/nl1byr5XJFJZV8eR1A/H2Eg4cK2L2i9+DTfjFFbEsP3yQC3tE8Oydo9j10moScwy/u+0inlyym0UHDjE1PoanbxzkNjt/8My/GXegsTtWUx4EGwtcKyJXAQFAKPAPoK2I+BhjqoAY4IhVPwOIBTJExAcIA3Kb8PmqAd74PoXCsiq+nHMxucUVvPF9CvO+P8hrq1K4rH8kr94RX+cONzElh5++s5E2vt588NPRjOoeUTNv9iXdWbI9k4Xr05k2qgv/3ZjOS8v3k1dSCUCHYD8entibuZ9sp42vN3HBwtNf7QHgpWlDERFmXdiVRz/ZweSXVrM3q5CfXtyNxyb106d9lXKCRicAY8xjwGMA1hnAr4wxt4vIf4GbgIXATGCRtchi6/0P1vwVpintT6recosrmL/mIFcPiqJ/tP2i6kW92pNfUsnba1N5Ydk+nl26l7mT+p6y3OdbDvObj7fTJSKQt+8cQUy7wFPmD4ppy+ju4by0Yj//WLaP4opqRncP5/HJ/Zm/JpWXViRzKLeEDal5PDd1MBEF+ymJ6Muh3GJGW4nk2sGdeeqrPRzMKeb/pg7mxngdpF0pZ3FEVxCPAgtF5ElgCzDPKp8H/EdEkrEf+d/qgM/2WFkFZSzZnsnSnUdp4+fNs1MH1XSN/OqqA5RWVvPIxF6nLBMW6MtDE3uRVVjGq6sOEBcRyKQLoiivrOYPi3bx9a6jjIhrxxszhtM2sO52/ocm9Oa+9zYxYUAnbh/VhSGxbRER/jJlIDsP57No6xEu7tWeG4d1ZtWqZK4eFHXK8m38vHn37lH4+3jRKzLEMRtHKVWnZkkAxpgEIMGaTgFG1lGnDJjaHJ/n6UorqgH7zhMgObuI6/61hqLyKvp2CmHH4XxufGUtL906lPfWpfHxpgxuHBZDz45172D/eM0AkrOLmPvpDuZ+ugMAPx8vHr2yLz+9uNs5B1gZ0yOCLX+4/IzyNn7evDY9nhe+3cevr+x7zmsMAztrNw9KuYJ2BtcCPfrJdlYnH+etWSPo1TGYe9/dhJ+PF0vvvYQ+nULYmn6Cu97ewPX/Xou3l3D/+B48eGmvs67Pz8eLt2aNYNnuLI4XlVNUXsU1g6Pp0aFpdzB0jQjiH7cObdI6lFKOowmghTHG8P3+Y+SVVDLt9UQGxYSRfKyI/9w1ij6d7Ef4Q2Lb8sm9F/L6dwe4fVTXeh1hB/v7cN3Qzo4OXynlRjQBtDDZJYa8kkoemdibb5OOsu5gLr+8rDcX9Wp/Sr1u7YN46oZBLopSKdUSaAJoYQ7k2wC4fEAkd1/cjXUpOYzv09HFUSmlWiJNAC3MgRPVBPl50zsyBG8vYUK/SFeHpJRqobQ76BYm+YSNwbFt3eopWaVUy6QJoAUpragmvdDG0C5tXR2KUqoV0ATQguw4nI/NwNDYdq4ORSnVCmgCaEG2pOUB6BmAUqpZaAJoQbaknaBjoBAR7O/qUJRSrYAmgBbCGMPmtDx6hOmvTCnVPHRv0kJsTT9BdmE5fcJ1gHSlVPPQBNBCvLUmlZAAH0ZH6aMbSqnmoQmgBThyopQvd2Ry64hYAnz0/n+lVPPQBNACvPPDIYwxzLwwztWhKKVaEU0Abq6koooP1qdx5cBOZ4zIpZRSTaEJwM29vy6N/NJK7hrbzdWhKKVaGU0Abux4UTkvLtvPJb07EN9Vn/5VSjUvTQBu7Nmv91JaWc0T1/Q/55CKSinVGJoA3NS29BN8tCmdO8fGNXloRqWUqosmADf1zNI9RAT5M2fC2cfyVUqpptAE4IYy8kpYk5zDjDFdCQnwdXU4SqlWShOAG1q09QgA1+sg7UopB9IE4GaMMXy6OYMRce2IDdf7/pVSjqMJwM3sPFzAgWPFXKdH/0opB9ME4GY+23IYP28vJl8Q7epQlFKtnCYAN1JVbWPxtiNc2rcjYYF68Vcp5ViaANzIqn3HOF5UzvXDtPlHKeV4mgDcyEcb02kf7MelfTu6OhSllAdodAIQkVgRWSkiSSKyS0QessrDReRbEdlv/WxnlYuIvCQiySKyXUSGNdeXaA2OF5WzPCmbG4bF4OuteVkp5XhN2dNUAb80xvQDRgP3i0h/YC6w3BjTC1huvQeYBPSyXrOBV5rw2a3OZ5sPU2UzTI2PcXUoSikP0egEYIzJNMZstqYLgSSgMzAFWGBVWwBcZ01PAd4xdolAWxGJanTkrYgxho82pjO0S1t6RYa4OhyllIdolrYGEYkDhgLrgEhjTCbYkwRwskG7M5Bea7EMq8zjbUk/wf7sIm4ZHuvqUJRSHqTJI4yLSDDwCfCwMabgHN0W1zXD1LG+2dibiIiMjCQhIaGpIZ6hqKjIIettrDe2l+PvDWEFB0hISDlnXXeLvSE0dtfQ2F2jRcRujGn0C/AFlgK/qFW2F4iypqOAvdb0a8C0uuqd7RUfH28cYeXKlQ5Zb2NkFZSaXr/90vzh8x31qu9OsTeUxu4aGrtruDJ2YKOpxz68KXcBCTAPSDLGPF9r1mJgpjU9E1hUq3yGdTfQaCDfWE1FnuzdxDQqbTZm6ZCPSikna0oT0FhgOrBDRLZaZb8FngY+EpG7gTRgqjXvS+AqIBkoAe5swme3CmWV1byXeIgJfTvSrX2Qq8NRSnmYRicAY8xq6m7XB5hQR30D3N/Yz2uNFm87Qk5xhQ74rpRyCX3iyEVsNsOb36fQt1MIY3pEuDocpZQH0gTgIl/syGRfVhH3juuhA74rpVxCE4ALVFXbeGHZPvpEhnDNIO32WSnlGpoAnOR4UTkr92ZTUWVj0dYjpBwr5pHLeuHlpUf/SinXaPKDYKp+/r3yAG+tOUj7YD9AGBAdyhUDOrk6LKWUB9MzACfJyCshMtSfoV3aUVBayaNX9tW2f6WUS+kZgJNkFZTROzKEN2YMx2Yz2vSjlHI5PQNwkqMFZXQKDQDQnb9Syi1oAnCCqmobxwrL6RQW4OpQlFKqhiYAJzheVIHNQGSoJgCllPvQBOAERwvKAGqagJRSyh1oAnCCrJMJQJuAlFJuRBOAE5xMANoEpJRyJ5oAnOBofhk+XkJEkJ+rQ1FKqRqaAJzgaEEZHUP89fZPpZRb0QTgBFkFZURq+79Sys1oAnCCo/llegeQUsrtaAJwgqyCcr0ArJRyO5oAHKyovIqi8iq9BVQp5XY0ATjY0Xx9CEwp5Z40AThYtj4DoJRyU5oAHOyoPgWslHJTmgAcTPsBUkq5K00ADpaVX0ZogA9t/LxdHYpSSp1CE4CDHS0o0+YfpZRb0gTgYEf1GQCllJvSBOBAydmFpBwr0vZ/pZRb0gTgIMuTsrjuX2vx9/Fi1tg4V4ejlFJn8HF1AC1VVkEZy5OyuW5oNIF+P27GPUcL+OeKZL7ckcmA6FBenz6c6LZtXBipUkrVzekJQESuBF4EvIE3jTFPOzuGpjDG8NHGdJ78IonCsir+nZDM366/gPIqG++tO0TC3mME+/tw37gePDC+l979o5RyW05NACLiDfwLuAzIADaIyGJjzG5nxnE2xhgKyqrILa4gKbOAdSk57DpSQE5xBXklFVRXG2zGUFxRzchu4cwY05X/+2YfM95aD0DHEH8emdibWRfGERbo6+Jvo5RS5+bsM4CRQLIxJgVARBYCU4DzJoBqmyE9t4TckopTyiurbKw9kMPXO4+SfKyoXkEYY5Bvvjyj3GYMxvz4vo2vNxd0DmNAdCjtAv3w9bZfMukfHcoNQzvj5SVM7BfJRxvT6Rjiz4R+kTV1lFLK3Tk7AXQG0mu9zwBGna3yvqxCLnt+FdU2Q8aJUiqqbHXWE4ERceHMvqQ73nL+UbcOHTpE165d61xPWBtfwoP8iGsfxAWdw867Qw/w9WbGmLjzfqZSSrkbMbUPeR39YSJTgSuMMfdY76cDI40xD9aqMxuYDRAUGRd/5aOvIAIRAV50DhbC/E/dwQsQG+pFW//6H3kXFRURHBzc9C/kAhq7a2jsrqGxN8748eM3GWOGn7eiMcZpL2AMsLTW+8eAx85WPz4+3jjCypUrHbJeZ9DYXUNjdw2NvXGAjaYe+2RnN1hvAHqJSDcR8QNuBRY7OQallFI4+RqAMaZKRB4AlmK/DfQtY8wuZ8aglFLKzunPARhjvgTOvAVHKaWUU+k9i0op5aE0ASillIfSBKCUUh5KE4BSSnkoTQBKKeWhnPokcEOJyDHgkANW3R447oD1OoPG7hoau2to7I3T1RjT4XyV3DoBOIqIbDT1eUzaDWnsrqGxu4bG7ljaBKSUUh5KE4BSSnkoT00Ar7s6gCbQ2F1DY3cNjd2BPPIagFJKKc89A1BKKY/XKhKAiLwlItkisrNW2RARSRSRrSKyUURGWuVhIvI/EdkmIrtE5M5ay8wUkf3Wa6YLYx8sIj+IyA4r1tBa8x4TkWQR2SsiV9Qqv9IqSxaRue4Wu4hcJiKbrPJNInJprWXirfJkEXlJpB7Dujk5/lrzu4hIkYj8qlaZW297a94ga94ua36AVe70bd/AvxtfEVlglSeJyGO1lnHFdo8VkZVWLLtE5CGrPFxEvrX2Hd+KSDurXKztmiwi20VkWK11OX1/c4b6DBrg7i/gEmAYsLNW2TfAJGv6KiDBmv4t8HdrugOQC/gB4UCK9bOdNd3ORbFvAH5iTd8F/MWa7g9sA/yBbsAB7N1qe1vT3a3vsg3o72axDwWiremBwOFay6zHPliQAF+d/L25U/y15n8C/Bf4lfW+JWx7H2A7MNh6HwF4u2rbNzD224CF1nQgkArEuXC7RwHDrOkQYJ/1f/kMMNcqn8uP+5irrO0qwGhgnVXukv3N6a9WcQZgjPkO+478lGLg5BFQGHCkVnmIdaQTbC1XBVwBfGuMyTXG5AHfAle6KPY+wHfW9LfAjdb0FOz/DOXGmINAMjDSeiUbY1KMMRXAQquu28RujNlijDn5O9gFBIiIv4hEAaHGmB+M/T/jHeA6R8duxdSQbY+IXIf9H7X2GBZuv+2By4Htxpht1rI5xphqV237BsZugCAR8QHaABVAAa7b7pnGmM3WdCGQhH2s8ynAAqvaAn7cjlOAd4xdItDW2u4u2d+crlUkgLN4GHhWRNKB57APPwnwMtAPe0LYATxkjLFR94D1nZ0X7il2Atda01OBWGv6bDG2hNhruxHYYowpxx5nRq15rowdzhK/iAQBjwJ/Oq1+S9j2vQEjIktFZLOI/MYqd6dtf7bYPwaKgUwgDXjOGJOLG2x3EYnDfma7Dog0xmSCPUkAHa1qbv0/25oTwL3AI8aYWOARYJ5VfgWwFYgGhgAvW+2NdbV9uuoWqbuA+0VkE/bTzAqr/GwxtoTYARCRAcDfgZ+dLKpjHa68Ne1s8f8JeMEYU3RafXeK/2yx+wAXAbdbP68XkQm0jNhHAtXY/1+7Ab8Uke64OHYRCcbeHPiwMabgXFXrKHOb/1mnjwjmRDOBh6zp/wJvWtN3Ak9bp7zJInIQ6Is9A4+rtXwMkOCUSE9jjNmD/bQdEekNXG3NyuDUI+oYfmzaOlu5U50jdkQkBvgMmGGMOWAVZ2CP9ySXxQ7njH8UcJOIPAO0BWwiUgZswv23fQawyhhz3Jr3JfY2+Hdxk21/jthvA742xlQC2SKyBhiO/ejZJdtdRHyx7/zfM8Z8ahVniUiUMSbTauLJtsrP9j/rFvub1nwGcAT4iTV9KbDfmk4DJgCISCT2tscU7OMUXy4i7awr+JdbZU4nIh2tn17A74FXrVmLgVuttvNuQC/sF/E2AL1EpJuI+AG3WnWd7myxi0hb4AvgMWPMmpP1rdPlQhEZbV2XmQEscnrglrPFb4y52BgTZ4yJA/4B/M0Y8zItYNtj/zseJCKBVlv6T4Dd7rTtzxF7GnCpdTdNEPYLqXtw0Xa3ttM8IMkY83ytWYuxH3Ri/VxUq3yGFf9oIN/a7u6xv3H2VWdHvIAPsLcRVmLPrHdjP9XdhP3ugHVAvFU3GvsdQjuwtzveUWs9d2G/sJoM3OnC2B/CfnfBPuBprAf2rPq/w373w15q3bGB/W6Dfda837lb7Nj/qYuxN7+dfHW05g23fhcHsF+jEXeL/7Tl/oh1F1BL2PZW/TuwX7zeCTxTq9zp276BfzfB2M/gdwG7gV+7eLtfhL2pZnutv+OrsN9ZtRz7geZyINyqL8C/rBh3AMNrrcvp+5vTX/oksFJKeajW3ASklFLqHDQBKKWUh9IEoJRSHkoTgFJKeShNAEop5aE0ASillIfSBKCUg4mIt6tjUKoumgCUqkVE/nKyj3fr/V9FZI6I/FpENlh9uv+p1vzPxT6+wS4RmV2rvEhE/iwi67B3t6yU29EEoNSp5mE90m91S3ArkIW9242R2DsQjBeRS6z6dxlj4rE/UTtHRCKs8iDs/d2PMsasduYXUKq+WnNncEo1mDEmVURyRGQoEAlsAUZg76tli1UtGHtC+A77Tv96qzzWKs/B3oPlJ86MXamG0gSg1JneBGYBnYC3sHce+JQx5rXalURkHDARGGOMKRGRBCDAml1mjKl2VsBKNYY2ASl1ps+wj840AnsPjUuBu6w+4BGRzlbvlWFAnrXz74u9p0qlWgw9A1DqNMaYChFZCZywjuK/EZF+wA/23oApwt675tfAz0VkO/beWRNdFbNSjaG9gSp1Guvi72ZgqjFm//nqK9VSaROQUrWISH/s/bMv152/au30DEAppTyUngEopZSH0gSglFIeShOAUkp5KE0ASinloTQBKKWUh9IEoJRSHur/Adaz/PM757akAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f8ebc9e4ba8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import pandas as pd    \n",
    "A=counts.toPandas() # Transform a spark Dataframe to a Pandas Dataframe\n",
    "A.plot.line('year','count')\n",
    "grid()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Reading rows selectively\n",
    "Suppose we are only interested in snow measurements. We can apply an SQL query directly to the \n",
    "parquet files. As the data is organized in columnar structure, we can do the selection efficiently without loading the whole file to memory.\n",
    "\n",
    "Here the file is small, but in real applications it can consist of hundreds of millions of records. In such cases loading the data first to memory and then filtering it is very wasteful."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true,
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SELECT station,measurement,year \n",
      "FROM parquet.`/home/jovyan/work/Sections/Data/Weather/NY.parquet` \n",
      "WHERE measurement=\"SNOW\" \n",
      "15629 ['station', 'measurement', 'year']\n",
      "+-----------+-----------+----+\n",
      "|    station|measurement|year|\n",
      "+-----------+-----------+----+\n",
      "|USC00303452|       SNOW|1903|\n",
      "|USC00303452|       SNOW|1904|\n",
      "|USC00303452|       SNOW|1905|\n",
      "|USC00303452|       SNOW|1906|\n",
      "|USC00303452|       SNOW|1907|\n",
      "+-----------+-----------+----+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "data_dir='../../Data'\n",
    "query=\"\"\"SELECT station,measurement,year \n",
    "FROM parquet.`%s.parquet` \n",
    "WHERE measurement=\\\"SNOW\\\" \"\"\"%(weather_dir+'/'+zip_file[:-4])\n",
    "print(query)\n",
    "df2 = sqlContext.sql(query)\n",
    "print(df2.count(),df2.columns)\n",
    "df2.show(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Summary\n",
    "\n",
    "* Dataframes can be manipulated decleratively, which allows for more optimization.\n",
    "* Dataframes can be stored and retrieved from Parquet files.\n",
    "* It is possible to refer directly to a parquet file in an SQL query.\n",
    "* See you next time!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "skip"
    }
   },
   "source": [
    "## References\n",
    "* For an introduction to Spark SQL and Dataframes see: [Spark SQL, DataFrames](https://spark.apache.org/docs/latest/sql-programming-guide.html#spark-sql-dataframes-and-datasets-guide)\n",
    "* Also [spark-dataframe-and-operations](https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/) from [analyticsvidhya.com](https://www.analyticsvidhya.com)\n",
    "\n",
    "For complete API reference see\n",
    "* [SQL programming guide](https://spark.apache.org/docs/latest/sql-programming-guide.html) For Java, Scala and Python (Implementation is first in Scala and Python, later pyspark)\n",
    "* [pyspark API for the DataFrame class](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame) \n",
    "* [pyspark API for the pyspark.sql module](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark-sql-module)\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "celltoolbar": "Slideshow",
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {
    "height": "263px",
    "width": "252px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
